two afternoon data (reading DEG files) (under construction)

# soil_trt effects (coef=c("SBC_OLD"))
twoafternoon.trtlive.DEGs.all.v3.0anno <- read_csv(file.path("..","output","twoafternoon.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# interaction, coef = str_which(colnames(dge.twoafternoon.design),"soil_trtSBC_OLD:")
twoafternoon.interaction.trtlive.samplingday.lrt.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","twoafternoon.interaction.trtlive.samplingday.lrt.DEGs.all.v3.0anno.csv")) 
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD.sampling_day03 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day04 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day06 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day08 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day10 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day13 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day14 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# any soil_trt effects, coef = str_which(colnames(dge.twoafternoon.design),"soil_trtSBC_OLD")
twoafternoon.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","twoafternoon.any.trtlive.DEGs.all.v3.0anno.csv")) 
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day03 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day04 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day06 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day08 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day10 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day13 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_day14 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )

diel data (reading DEG files)

# treatment only
dge.diurnal34.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal34.trtlive.DEGs.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
dge.diurnal1314.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal1314.trtlive.DEGs.all.v3.0anno.csv")) # updated 
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# treatment and treatment:time interaction
dge.diurnal34.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal34.any.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# interaction (coef = str_which(colnames(dge.diurnal34.design),":"))
dge.diurnal34.interaction.trtlive.time.DEGs.all.v3.0anno<-read_csv(file=file.path("..","output","dge.diurnal34.interaction.trtlive.time.DEGs.all.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# interaction (coef = str_which(colnames(dge.diurnal1314.design),":"))
dge.diurnal1314.interaction.trtlive.time.DEGs.all.v3.0anno<-read_csv(file=file.path("..","output","dge.diurnal1314.interaction.trtlive.time.DEGs.all.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
##   genes = col_character(),
##   logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
##   logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_short_description = col_character(),
##   perc_ID = col_double()
## )
# check
dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dim() # [1] 4593   14 
## [1] 4593   14

format data

# select genes with higher CV 
## classic way
co.var.df <- function(x) ( 100*apply(x,1,sd)/rowMeans(x) )
cpm.timecourse.v3.0$cv<-co.var.df(cpm.timecourse.v3.0[,-1])
# tidyverse way (no working)
#cpm.timecourse.v3.0 %>% slice(1:100) %>% select(-1) %>% group_by(%>% mutate(cv=map(.,co.var.df ))
a<-hist(cpm.timecourse.v3.0$cv)

a
## $breaks
##  [1]   0  50 100 150 200 250 300 350 400 450 500 550 600 650 700
## 
## $counts
##  [1] 19889  5976   989   270    87    45    24    12     7     4     2     1
## [13]     1     1
## 
## $density
##  [1] 1.456643e-02 4.376739e-03 7.243299e-04 1.977443e-04 6.371759e-05
##  [6] 3.295738e-05 1.757727e-05 8.788633e-06 5.126703e-06 2.929544e-06
## [11] 1.464772e-06 7.323861e-07 7.323861e-07 7.323861e-07
## 
## $mids
##  [1]  25  75 125 175 225 275 325 375 425 475 525 575 625 675
## 
## $xname
## [1] "cpm.timecourse.v3.0$cv"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
# there are genes with extream value
cpm.timecourse.v3.0 %>% filter(cv>600)
# Check expression pattern
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes = cpm.timecourse.v3.0 %>% dplyr::filter(cv>450) %>% dplyr::slice(1:20)) ->p
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
p

ggsave(filename="../output/highCV.absvalue.genes.expression.png",width=11,height=8) # should I remove them????
# 
sum(as.integer(cpm.timecourse.v3.0$cv>30))/dim(cpm.timecourse.v3.0)[1] # [1] 0.5207265
## [1] 0.5207265
sum(as.integer(cpm.timecourse.v3.0$cv>40))/dim(cpm.timecourse.v3.0)[1] # [1] 0.3725282. Larger CV than SAS timecourse data ()??? Due to non log absolute expression value.
## [1] 0.3725282
# cf. sum(as.integer(SAS.expression.vst.s.kazu$cv>4.5))/dim(SAS.expression.vst.s.kazu)[1] #[1] 0.2300789

use log transformed data

cpm.timecourse.v3.0.log$cv<-co.var.df(cpm.timecourse.v3.0.log[,-1])
b<-hist(cpm.timecourse.v3.0.log$cv)
b
# use largeCV 
cpm.timecourse.v3.0.log.largeCV<-cpm.timecourse.v3.0.log[cpm.timecourse.v3.0[cpm.timecourse.v3.0$cv>40,"transcript_ID"],] 
dim(cpm.timecourse.v3.0.log.largeCV) # [1] 17262   289  > [1] 10173   290  (02/01/2020) (cf. SAS.expression.vst.s.kazu.largeCV is 7025 288)
c<-hist(cpm.timecourse.v3.0.log.largeCV$cv)
c

###########
#save(cpm.timecourse.v3.0.log.largeCV,file=file.path("..","output","cpm.timecourse.v3.0.log.largeCV.Rdata"))
write_csv(cpm.timecourse.v3.0.log.largeCV,path=file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"))

WGCNA

co-expression analysis by WGCNA

# The following setting is important, do not omit.
library(WGCNA) # errors in installing WGCNA on my computer at impute package installation (Jan 27, 2020). Use Whitney
options(stringsAsFactors = FALSE)
if(Sys.info()["nodename"]=="whitney") {
  enableWGCNAThreads(10) # in Whitney (Maloof lab server) 
} else if (Sys.info()["nodename"]=="Kazu-MBP.plb.ucdavis.edu") {
      enableWGCNAThreads(2) # in my computer
  }

run this in Whitney

#cpm.timecourse.v3.0.log.largeCV<-read_csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"))
# for some reasons in Whitney library columns were read ad character. Needs to fix it.
#cpm.timecourse.v3.0.log.largeCV<-read_csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"),
#                                          col_types=list(col_character(),col_double())) # error
cpm.timecourse.v3.0.log.largeCV<-read.csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz")) # using classic read.csv in Whitney

#load(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.Rdata"))
# 
 datExpr <-t(cpm.timecourse.v3.0.log.largeCV[,-1])
  # Choose a set of soft-thresholding powers
  powers = c(c(1:9), seq(from = 2, to=20, by=10))
  sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)  
  # Plot the results:
  #sizeGrWindow(9, 5)
  pdf("../output/largeCV.softthresholding.pdf",width=10,height=8)
  par(mfrow = c(1,2));
  cex1 = 0.9;
  # Scale-free topology fit index as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
       main = paste("Scale independence"));
  text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       labels=powers,cex=cex1,col="red");
  # this line corresponds to using an R^2 cut-off of h
  abline(h=0.90,col="red")
  # Mean connectivity as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], sft$fitIndices[,5],
       xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
       main = paste("Mean connectivity"))
  text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
  dev.off()
  # 
  net = blockwiseModules(datExpr, power = 9,
                         TOMType = "unsigned", minModuleSize = 20,
                         reassignThreshold = 0, mergeCutHeight = 0.25,
                         numericLabels = TRUE, pamRespectsDendro = FALSE,
                         saveTOMs = TRUE,
                         saveTOMFileBase = "cpm.timecourse.v3.0.log.largeCV.TOM",
                         verbose = 3)
  save(net,file="../output/net.cpm.timecourse.v3.0.log.largeCV.Rdata")  
  # open a graphics window
  pdf(file="../output/largeCV.dendrogram.pdf",width=10,height=8)
  # Convert labels to colors for plotting
  mergedColors = labels2colors(net$colors)
  # Plot the dendrogram and the module colors underneath
  plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                      "Module colors",
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)
  dev.off()
  # save parameters  
  moduleLabels = net$colors
  moduleColors = labels2colors(net$colors)
  MEs = net$MEs
  geneTree = net$dendrograms[[1]]
  save(MEs, moduleLabels, moduleColors, geneTree,file ="../output/all.largeCV.RData")

back to my computer to look WGCNA results

cpm.timecourse.v3.0.log.largeCV<-read.csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz")) 
dim(cpm.timecourse.v3.0.log.largeCV) # [1] 17262   289 -> [1] 10173   290 (Feb 01, 2020)
## [1] 10173   290
load("../output/net.cpm.timecourse.v3.0.log.largeCV.Rdata")  
load("../output/all.largeCV.RData")
# how many modules?
  table(net$colors);length(table(net$colors)) # 7 modules
## 
##    0    1    2    3    4    5    6 
## 4968 4723  174  126   79   72   31
## [1] 7

adding gene name, annotations

cpm.timecourse.v3.0.log.largeCV.modules <- tibble(
  transcript_ID=cpm.timecourse.v3.0.log.largeCV$transcript_ID,
  modules=moduleColors
)
#cpm.timecourse.v3.0.log.largeCV.modules.list<-list(transcript_ID=cpm.timecourse.v3.0.log.largeCV$transcript_ID,modules=moduleColors)
## prep
# annotation file for v3.0annotation
Br.v3.0.At.BLAST <- read_csv(file.path("..","Annotation_copy","output","v3.0annotation","Brapa_v3.0_annotated.csv")) 
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   name = col_character(),
##   chrom = col_character(),
##   subject = col_character(),
##   AGI = col_character(),
##   At_symbol = col_character(),
##   At_full_name = col_character(),
##   At_gene_model_type = col_character(),
##   At_short_description = col_character(),
##   At_Curator_summary = col_character(),
##   At_Computational_description = col_character()
## )
## See spec(...) for full column specifications.
# This annotation is redundant with name (Br grene). Eg 
Br.v3.0.At.BLAST %>% filter(name=="BraA01g040570.3C")
# reduce the redundancy (112418)
Br.v3.0anno.At.BLAST.highscore <- Br.v3.0.At.BLAST %>% group_by(name) %>% arrange(desc(score)) %>% dplyr::slice(1)
# function for adding annotation
## get object name https://stackoverflow.com/questions/14577412/how-to-convert-variable-object-name-into-string
myfunc <- function(v1) {
  deparse(substitute(v1))
}
myfunc(foo)
## [1] "foo"
# adding annotation and write_csv adding ".v3.0anno.csv" to the object name.
addAnno<-function(DGE) {temp<-left_join(DGE %>% rownames_to_column(var="genes"),Br.v3.0anno.At.BLAST.highscore,by=c(genes="name")) %>%  dplyr::select(genes,names(DGE),AGI, At_symbol, At_short_description, perc_ID); print(deparse(substitute(DGE)));
write_csv(temp, path=file.path("..","output",paste(deparse(substitute(DGE)),".v3.0anno.csv",sep="")));
return(temp)} 

asign moduleColor to corresponding Br genes

  #Br.v3.0anno.At.BLAST.highscore.list<-list()
  Bra.v3.0_cdna.list<-list()
  #names(Bra.v3.0_cdna.list)<-names(Bra.v3.0_cdna)
names(Bra.v3.0_cdna) %in% cpm.timecourse.v3.0.log.largeCV.modules$transcript_ID

  for(i in 1:length(Bra.v3.0_cdna)) {
    print(paste("i is ",i))
    print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID))
        print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID) %>% dim())
        print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID) %>% dim() ==c(1,1))
temp<-cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==names(Bra.v3.0_cdna)[i]) %>% dplyr::select(transcript_ID)
print(dim(temp)[1]==0)
    if(dim(temp)[1]==0) next else
    #Bra.v3.0_cdna.list[[i]]<-cpm.timecourse.v3.0.log.largeCV.modules[names(Bra.v3.0_cdna)[i],"modules"]
      # input module
Bra.v3.0_cdna.list[[i]]<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID==names(Bra.v3.0_cdna)[i]) %>% dplyr::select(modules) %>% as_vector()
  # iput gene name
    names(Bra.v3.0_cdna.list)[[i]]<-names(Bra.v3.0_cdna)[i]
  }

# clean up Brgo.v3.0_cdna.list
  table(sapply(Bra.v3.0_cdna.list,is.null))
  Bra.v3.0_cdna.list<-Bra.v3.0_cdna.list[!sapply(Bra.v3.0_cdna.list,is.null)]
  table(sapply(Bra.v3.0_cdna.list,is.null))
  
  save(Bra.v3.0_cdna.list,file="../output/Bra.v3.0_cdna.list.Rdata")
 ######### Did not work
# cpm.timecourse.v3.0.log.largeCV.modules %>% nest(transcript_ID) # this is not what I want
# library(purrr)
#cpm.timecourse.v3.0.log.largeCV.modules %>% purrr::transpose() 

ORA analysis of DEGs

# loading module info as custom categories compatible with goseq()
load("../output/Bra.v3.0_cdna.list.Rdata")
# GOseq
library(ShortRead);library(goseq);library(GO.db);library("annotate")
# for ggplot heatmap
## uncompress gz file
system(paste("gunzip -c ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.gz")," > ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")))
## read cDNA fasta file 
Bra.v3.0_cdna<-readDNAStringSet(file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")) # copied from /Volumes/data_work/Data8/NGS_related/Brassica_rapa_Upendra/G3
Bra.v3.0_cdna
## remove fasta file
system(paste("rm ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa"),sep=""))
# special funciton for GOseq
GOseq.customcategory.ORA<-function(genelist,padjust=0.05,custom.category.list=Bra.v3.0_cdna.list,Br_cdna=Bra.v3.0_cdna) { # return GO enrichment table, padjus, padjust=0.05. 
  
  bias<-nchar(Br_cdna)
  names(bias)<-names(Br_cdna)
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  #GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
  GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
  if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    return(enriched.GO)
  }
}
gene.up<-twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(logFC>0&FDR<0.05) %>% dplyr::select(genes) %>% as_vector()
gene.down<-twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(logFC<0&FDR<0.05) %>% dplyr::select(genes) %>% as_vector()

enriched.GO.up<-GOseq.customcategory.ORA(genelist=gene.up) # needs to wait for Bra.v3.0_cdna.list.Rdata ready in Whitney
## Error in GOseq.customcategory.ORA(genelist = gene.up): could not find function "GOseq.customcategory.ORA"
enriched.GO.down<-GOseq.customcategory.ORA(genelist=gene.down)
## Error in GOseq.customcategory.ORA(genelist = gene.down): could not find function "GOseq.customcategory.ORA"

expression pattern of module/genes of interest (logFC of soil treatment)

n<-1
gene.up.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.up,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes=gene.up.category[1:10,]) # works
## Error in transcript_ID %in% target.genes$transcript_ID: object 'gene.up.category' not found

expression pattern of module/genes of interest (normalized value)

# scaling expression data
cpm.timecourse.v3.0.scale<-t(scale(t(cpm.timecourse.v3.0[,-1]))) %>% as_tibble() %>% bind_cols(data.frame(transcript_ID=cpm.timecourse.v3.0$transcript_ID[]),.)

gene.up.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.up,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
gene.down.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.down,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(data=cpm.timecourse.v3.0.scale,target.genes=gene.up.category[1,]) 
## Error in transcript_ID %in% target.genes$transcript_ID: object 'gene.up.category' not found

using map-column method??? (under construction)

input<-tribble(
  ~target.genes,~data,~f,
  gene.up.category[1:10,],cpm.timecourse.v3.0.scale,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,
  gene.down.category[1:10,],cpm.timecourse.v3.0.scale,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2
)
input2<-tribble(
  ~f,~param,
  expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,list(target.genes=gene.up.category[1:10,],data=cpm.timecourse.v3.0.scale),
  expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,list(target.genes=gene.down.category[1:10,],data=cpm.timecourse.v3.0.scale)
)

test<-input2 %>% mutate(output=invoke_map(f,param)) # works, but parameters are not visible

# how about to use map2?
## an example
params<-tribble(
    ~mean,~sd,~n,
  5,1,1,
  10,5,3,
  -3,10,5
)
params %>% pmap(rnorm) 
# 
input3<-tribble(
  ~target.genes,~data,~title,
  gene.up.category[1:10,],cpm.timecourse.v3.0.scale,"2-afternoon soil up",
  gene.down.category[1:10,],cpm.timecourse.v3.0.scale,"2-afternoon soil down",
)
#input3 %>% pmap(expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2) -> expression.pattern
# 
expression.pattern <- input3 %>% mutate(plot=pmap(.,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2))
expression.pattern$plot[1] # plot
#input3 %>% mutate(plot=invoke_map(~expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2)) # errors

calculate fold change (trial with six genes) (run only once)

temp.abs<-cpm.timecourse.v3.0.log %>% head() %>%
  gather(sample,value,-transcript_ID) %>%
  mutate(abs.value=2^value) %>%
  inner_join(sample.description.timecourse, by="sample") %>% 
  split(.$soil_trt) 
mean.by.soil<-function(x) {group_by(x, group,transcript_ID) %>% summarize(mean=mean(abs.value))}
temp.abs.mean<-temp.abs %>% map(.,mean.by.soil) 
cpm.timecourse.v3.0.logFC<-tibble(transcript_ID=temp.abs.mean[["SBC_OLD"]]$transcript_ID,group=temp.abs.mean[["SBC_OLD"]]$group,logFC=log(temp.abs.mean[["SBC_OLD"]]$mean/temp.abs.mean[["ATM_BLANK"]]$mean)) %>% left_join(sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time"),by="group")
# check logFC range
range(cpm.timecourse.v3.0.logFC$logFC) #(Jan 31, 2020)

calculate fold change (full genes) (run only once)

temp.abs<-cpm.timecourse.v3.0.log  %>%
  gather(sample,value,-transcript_ID) %>% mutate(abs.value=2^value) %>%
  inner_join(sample.description.timecourse, by="sample") %>% 
  split(.$soil_trt) 
# mean of absolute value funciton
mean.by.soil<-function(x) {group_by(x, group,transcript_ID) %>% summarize(mean=mean(abs.value))}
# calculating absolute value mean
temp.abs.mean<-temp.abs %>% map(.,mean.by.soil) 
# making summary tibble
temp2<-sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time")
sample.description.timecourse.logFC<-temp2[!duplicated(temp2),]
# add sample info
cpm.timecourse.v3.0.logFC<-tibble(transcript_ID=temp.abs.mean[["SBC_OLD"]]$transcript_ID,group=temp.abs.mean[["SBC_OLD"]]$group,logFC=log(temp.abs.mean[["SBC_OLD"]]$mean/temp.abs.mean[["ATM_BLANK"]]$mean)) %>% left_join(sample.description.timecourse.logFC,by="group")
# check
dim(cpm.timecourse.v3.0.logFC)
# check frequency distribution
a<-hist(cpm.timecourse.v3.0.logFC$logFC) # most of them are small
a
# what are genes with super high logFC?
high.FC.genes<-cpm.timecourse.v3.0.logFC %>% filter(abs(logFC)>5) %>% dplyr::select(transcript_ID) 
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes = high.FC.genes[1:10,])
#
addAnno2<-function(DGE) {temp<-left_join(DGE,Br.v3.0anno.At.BLAST.highscore,by=c("transcript_ID"="name")) %>%  dplyr::select(transcript_ID,names(DGE),AGI, At_symbol, At_short_description, perc_ID); print(deparse(substitute(DGE)));
write_csv(temp, path=file.path("..","output",paste(deparse(substitute(DGE)),".v3.0anno.csv",sep="")));
return(temp)} 
#
addAnno2(high.FC.genes)
#
dim(cpm.timecourse.v3.0.logFC) #[1] 1110000       5

#write_csv(cpm.timecourse.v3.0.logFC,path="../output/cpm.timecourse.v3.0.logFC.csv") # too large (306 M)
write_csv(cpm.timecourse.v3.0.logFC,path="../output/cpm.timecourse.v3.0.logFC.csv.gz") # 12.3 M

logFC expression pattern (only two afternoon)

cpm.timecourse.v3.0.logFC <-read_csv("../output/cpm.timecourse.v3.0.logFC.csv.gz")
## Parsed with column specification:
## cols(
##   transcript_ID = col_character(),
##   group = col_character(),
##   logFC = col_double(),
##   sampling_day = col_character(),
##   sampling_time = col_character()
## )
target.genes<-gene.up
# expression.pattern.Br.graph.timecourse.v3.0annotation.logFC<-function(data=cpm.timecourse.v3.0.logFC,target.genes,title="",subset.data="only_two_afternoon"){
# #print(paste("data is",data[1:10,]))
# #print(paste("tissue.type is root"))
# data[is.na(data)] <- 0 #
# data.temp<-data  %>% dplyr::filter(transcript_ID %in% target.genes) 
# 
# # if (2-afternoon=TRUE)
# if (subset.data=="only_two_afternoon") {
# p<-data.temp %>% ggplot(aes(x=sampling_day,y=logFC))  + 
#   geom_boxplot(alpha = 0.5)  + 
#   theme_bw() +
#   theme(strip.text.y=element_text(angle=0),axis.text.x=element_text(angle=90)) +
#   theme(legend.position="bottom") + labs(title=title)
# p
# } else {print("Define subset.data other than only_two_afternoon.")}
# }
# test the function
expression.pattern.Br.graph.timecourse.v3.0annotation.logFC(target.genes=gene.up,subset.data="only_two_afternoon")

J meeting (Jan 28, 2020)

K-means clustering

# 2_afternoon DEG expression data (scaled)
cpm.timecourse.v3.0.scale.twoafternoon.DEG<-cpm.timecourse.v3.0.scale %>% dplyr::select(-cv) %>% 
  inner_join(twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_time=="2_afternoon")
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# spread
cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread<-cpm.timecourse.v3.0.scale.twoafternoon.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread) # [1] 1442  97
## [1] 1442   97
# calculate wss
wss <- (nrow(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1],2,var))
for (i in 2:20) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1],
                                     centers=i,iter.max = 10)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

Let’s perform the actual clsutering using K=8:

set.seed(20)
kClust.8 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.8 <- kClust.8$cluster
# number of clusters
cluster.8.num<-tibble(cluster=kClusters.8) %>% group_by(cluster) %>% summarize(n=sum(cluster)) 
cluster.8.num$cluster<-as.character(cluster.8.num$cluster) # classic way

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

clust.centroid = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}
kClustcentroids.8 <- sapply(levels(factor(kClusters.8)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], kClusters.8)

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.8 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.8.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]) 
# plot
  p8<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",color = "Cluster",y="scaled expression level") 
p8

ggsave(p8,file="../output/Twoafternoon.DEG.Kmean.8clusters.png",width=11,height=8)

Let’s perform the actual clsutering using K=5:

set.seed(20)
kClust.5 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.5 <- kClust.5$cluster
# number of clusters
cluster.5.num<-tibble(cluster=kClusters.5) %>% group_by(cluster) %>% summarize(n=sum(cluster))
cluster.5.num$cluster<-as.character(cluster.5.num$cluster) # classic way

Now we can calculate the cluster ‘cores’ aka centroids:

kClustcentroids.5 <- sapply(levels(factor(kClusters.5)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], kClusters.5)

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.5 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.5.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:480)[!duplicated(.$group.cluster)]) # only cluster 1... why???
# plot
  p5<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",color = "Cluster",y="scaled expression level") 
p5

ggsave(p5,file="../output/Twoafternoon.DEG.Kmean.5clusters.png",width=11,height=8)

logFC K-means

expression.pattern.Br.graph.timecourse.v3.0annotation.logFC(target.genes=gene.up,subset.data="only_two_afternoon")

# 2_afternoon DEG expression data (scaled)
cpm.timecourse.v3.0.logFC.twoafternoon.DEG<-cpm.timecourse.v3.0.logFC %>% 
  inner_join(twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>% filter(sampling_time=="2_afternoon")
# spread
cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread<-cpm.timecourse.v3.0.logFC.twoafternoon.DEG %>% dplyr::select(transcript_ID,group,logFC) %>% spread(group,logFC,-1)
dim(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread) # [1] 1474  97
## [1] 1442    9
# calculate wss
wss.logFC <- (nrow(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1],2,var))
for (i in 2:20) wss.logFC[i] <- sum(kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1],
                                     centers=i,iter.max = 20)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:20, wss.logFC, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

# Let’s perform the actual clsutering using K=5:

set.seed(20)
kClust.logFC.5 <- kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.logFC.5 <- kClust.logFC.5$cluster
# number of clusters
cluster.5.num<-tibble(cluster=kClusters.logFC.5) %>% group_by(cluster) %>% summarize(n=sum(cluster))
cluster.5.num$cluster<-as.character(cluster.5.num$cluster) # classic way

Now we can calculate the cluster ‘cores’ aka centroids:

kClustcentroids.logFC.5 <- sapply(levels(factor(kClusters.logFC.5)), clust.centroid, cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], kClusters.logFC.5)

Plotting the centroids to see how they behave: tidyverse version

# making sample.description.timecourse.logFC
temp2<-sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time")
sample.description.timecourse.logFC<-temp2[!duplicated(temp2),]
# plot
p.logFC.5<-kClustcentroids.logFC.5 %>% as_tibble(rownames="group") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse.logFC,by="group") %>% 
  inner_join(cluster.5.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) ) %>%
  ggplot(aes(x=sampling_day,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") +
  facet_grid(cluster.n~.) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",color = "Cluster",y="scaled expression level") 
## Warning: Column `group` joining character vector and factor, coercing into
## character vector
p.logFC.5

ggsave(p.logFC.5,file="../output/Twoafternoon.DEG.logFC.Kmean.5clusters.png",width=11,height=8)

Let’s perform the actual clsutering using K=8:

set.seed(20)
kClust.logFC.8 <- kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.logFC.8 <- kClust.logFC.8$cluster
# number of clusters
cluster.8.num<-tibble(cluster=kClusters.logFC.8) %>% group_by(cluster) %>% summarize(n=sum(cluster))
cluster.8.num$cluster<-as.character(cluster.8.num$cluster) # classic way

Now we can calculate the cluster ‘cores’ aka centroids:

kClustcentroids.logFC.8 <- sapply(levels(factor(kClusters.logFC.8)), clust.centroid, cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], kClusters.logFC.8)

Plotting the centroids to see how they behave: tidyverse version

p.logFC.8<-kClustcentroids.logFC.8 %>% as_tibble(rownames="group") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse.logFC,by="group") %>% 
  inner_join(cluster.8.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) ) %>%
  ggplot(aes(x=sampling_day,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") +
  facet_grid(cluster.n~.) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",color = "Cluster",y="scaled expression level") 
## Warning: Column `group` joining character vector and factor, coercing into
## character vector
p.logFC.8

ggsave(p.logFC.8,file="../output/Twoafternoon.DEG.logFC.Kmean.8clusters.png",width=11,height=8)

conclusion

load Brgo.v3.0anno.Atgoslim.BP.list

load(file.path("..","Annotation_copy","output","v3.0annotation","Brgo.v3.0anno.Atgoslim.BP.list.Rdata"))

load GO.ORA function

GOseq function for Brassica rapa (v3.0)

# GOseq
library(ShortRead);library(goseq);library(GO.db);library("annotate")
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:glue':
## 
##     collapse, trim
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## 
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
## 
##     last
## 
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:purrr':
## 
##     compose
## The following object is masked from 'package:tibble':
## 
##     view
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
## 
## Attaching package: 'geneLenDataBase'
## The following object is masked from 'package:S4Vectors':
## 
##     unfactor
## 
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: XML
# for ggplot heatmap
## uncompress gz file
system(paste("gunzip -c ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.gz")," > ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")))
## read cDNA fasta file 
Bra.v3.0_cdna<-readDNAStringSet(file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")) # copied from /Volumes/data_work/Data8/NGS_related/Brassica_rapa_Upendra/G3
Bra.v3.0_cdna
##   A DNAStringSet instance of length 46250
##         width seq                                           names               
##     [1]  1254 ATGCGACCACCGGGTGTTGTT...GAGTCTCTCTTGCTCGCTTAA BraA01g000010.3C
##     [2]  1668 ATGCCAGCAATGCATGCCGTT...AGATGGATCACAAAAGATTAA BraA01g000020.3C
##     [3]   957 ATGATGCTTCTCGTTCATACC...AACTTGGAGTTCCCTGAGTGA BraA01g000030.3C
##     [4]  1299 ATGAGTCGTCTTCTCCTTGCT...GGGTCACGAGATGAGCTATAA BraA01g000040.3C
##     [5]   774 ATGGATTCTGGGCTTCAGCAT...GGAAAGCAGTTCCTTTCGTGA BraA01g000050.3C
##     ...   ... ...
## [46246]   162 ATGCGTCCGTCCTCAGCTCCC...TCTTTGGTGGTCCGGTTCTAA BraAnng001840.3C
## [46247]  1455 ATGTCTAATCAAGGATCAGGA...ACAGGTTTGTTTAGGTGCTAA BraAnng001850.3C
## [46248]  1011 ATGGACAACGTAATTCTGAAA...TCAGGGAAGAAAAGCCCCTGA BraAnng006150.3C
## [46249]   870 ATGTTTCCAAGACGTACAAGG...AGCAGTTGTCCTTATAGTTAG BraAnng000040.3C
## [46250]  1338 ATGCCGCAACAATACTGGAAC...GGAGAGAACCTTATCTCCTGA BraAnng003440.3C
## remove fasta file
system(paste("rm ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa"),sep=""))
# GOseq function
GOseq.Brgo.v3.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v3.0anno.Atgoslim.BP.list,Br_cdna=Bra.v3.0_cdna) { # return GO enrichment table, padjus, padjust=0.05. 
  
  bias<-nchar(Br_cdna)
  names(bias)<-names(Br_cdna)
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  #GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
  
  #head(GO.pval) 
  if(ontology=="BP") {
    GO.pval2<-subset(GO.pval,ontology=="BP")
  } else if(ontology=="CC") {
    GO.pval2<-subset(GO.pval,ontology=="CC")
  } else {
    GO.pval2<-subset(GO.pval,ontology=="MF")
  }
    
  GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
  if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    
    ## write Term and Definition 
    for(i in 1:dim(enriched.GO)[1]) {
      if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
      enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
      enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
      }
    }
    return(enriched.GO)
  }
}
#
head(Bra.v3.0_cdna)
##   A DNAStringSet instance of length 6
##     width seq                                               names               
## [1]  1254 ATGCGACCACCGGGTGTTGTTTC...CTGAGTCTCTCTTGCTCGCTTAA BraA01g000010.3C
## [2]  1668 ATGCCAGCAATGCATGCCGTTTT...GTAGATGGATCACAAAAGATTAA BraA01g000020.3C
## [3]   957 ATGATGCTTCTCGTTCATACCCG...GGAACTTGGAGTTCCCTGAGTGA BraA01g000030.3C
## [4]  1299 ATGAGTCGTCTTCTCCTTGCTCA...GTGGGTCACGAGATGAGCTATAA BraA01g000040.3C
## [5]   774 ATGGATTCTGGGCTTCAGCATCT...AAGGAAAGCAGTTCCTTTCGTGA BraA01g000050.3C
## [6]  3327 ATGGCGTCCACTCCTCCTCAAAA...GCGGTGGGTTTCAATTTCCTTGA BraA01g000060.3C
# length(bias) # 44239 > 45019 where the bias come from?
#  bias.data vector must have the same length as DEgenes vector!

GO ORA of each cluster

any trt DEGs (two afternoon) clustering and cluster ORAs

K-means clustering

# 2_afternoon DEG expression data (scaled)
cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG<-cpm.timecourse.v3.0.scale %>% dplyr::select(-cv) %>% 
  inner_join(twoafternoon.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_time=="2_afternoon")
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# spread
cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread<-cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread) # [1] 2178   97
## [1] 2178   97
# calculate wss
wss <- (nrow(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1],2,var))
for (i in 2:20) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1],
                                     centers=i,iter.max = 10)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

Let’s perform the actual clsutering using K=5:

set.seed(20)
kClust.any.trtlive.5 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.5 <- kClust.any.trtlive.5$cluster
# number of clusters
cluster.any.trtlive.5.num<-tibble(cluster=kClusters.any.trtlive.5) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.any.trtlive.5.num$cluster<-as.character(cluster.any.trtlive.5.num$cluster) # classic way
cluster.any.trtlive.5.num

Now we can calculate the cluster ‘cores’ aka centroids: # find centroid in cluster

kClustcentroids.any.trtlive.5 <- sapply(levels(factor(kClusters.any.trtlive.5)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.5)
kClustcentroids.any.trtlive.5 %>% head()
##                                   1           2            3           4
## 1a1_q_002_S1_R1_001    -0.370624345 -0.27016316 -0.781291105 -0.09145448
## 1a3_q_004_S3_R1_001     0.586129826 -0.45516870 -0.082928903 -0.31258537
## 1a7_q_007_d8_S7_R1_001  0.603390134 -0.06622119 -0.118731825 -0.06524704
## 1a8_q_008_d8_S8_R1_001  0.513232164 -0.44547237 -0.347272642 -0.23834981
## 1c6_q_028_S22_R1_001   -0.008042207 -0.39487064  0.006183179 -0.18385917
## 1d3_q_038_S27_R1_001    0.423676033 -0.18654295 -0.166417100 -0.15567393
##                                  5
## 1a1_q_002_S1_R1_001     0.48593350
## 1a3_q_004_S3_R1_001    -0.04775671
## 1a7_q_007_d8_S7_R1_001 -0.02718576
## 1a8_q_008_d8_S8_R1_001  0.19646882
## 1c6_q_028_S22_R1_001    0.18640730
## 1d3_q_038_S27_R1_001    0.01866725

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.any.trtlive.5 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.any.trtlive.5.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean 
  ### under construction ####
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]) 
# plot
p5.any.trtlive<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",color = "Cluster",y="scaled expression level") 
p5.any.trtlive

ggsave(p5.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.5clusters.png",width=11,height=6)

Let’s perform the actual clsutering using K=6:

set.seed(20)
kClust.any.trtlive.6 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.6 <- kClust.any.trtlive.6$cluster
# number of clusters
cluster.any.trtlive.6.num<-tibble(cluster=kClusters.any.trtlive.6) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.any.trtlive.6.num$cluster<-as.character(cluster.any.trtlive.6.num$cluster) # classic way
cluster.any.trtlive.6.num

Now we can calculate the cluster ‘cores’ aka centroids: # find centroid in cluster

kClustcentroids.any.trtlive.6 <- sapply(levels(factor(kClusters.any.trtlive.6)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.6)
kClustcentroids.any.trtlive.6 %>% head()
##                                  1          2          3           4
## 1a1_q_002_S1_R1_001     0.65133082 -0.3158853 -0.8226438 -0.37371653
## 1a3_q_004_S3_R1_001    -0.08768173 -0.5162655 -0.1554494  0.63258565
## 1a7_q_007_d8_S7_R1_001 -0.06407782 -0.0306091 -0.2095253  0.68120007
## 1a8_q_008_d8_S8_R1_001  0.31357964 -0.5018963 -0.3659202  0.54455777
## 1c6_q_028_S22_R1_001    0.07730847 -0.4843704  0.1434211 -0.03188877
## 1d3_q_038_S27_R1_001   -0.08124684 -0.1882003 -0.2357184  0.49835252
##                                  5           6
## 1a1_q_002_S1_R1_001    -0.05820563 -0.23794070
## 1a3_q_004_S3_R1_001    -0.22758805 -0.22464750
## 1a7_q_007_d8_S7_R1_001 -0.07040824 -0.03305671
## 1a8_q_008_d8_S8_R1_001 -0.16841988 -0.30044683
## 1c6_q_028_S22_R1_001   -0.12421671 -0.15052035
## 1d3_q_038_S27_R1_001   -0.14962567 -0.05877720

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.any.trtlive.6 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.any.trtlive.6.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean 
  ### under construction ####
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]) 
# plot
p6.any.trtlive<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): six clusters",color = "Cluster",y="scaled expression level") 
p6.any.trtlive

ggsave(p6.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.6clusters.png",width=11,height=6)

Let’s perform the actual clsutering using K=8:

set.seed(20)
kClust.any.trtlive.8 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.8 <- kClust.any.trtlive.8$cluster
# number of clusters
cluster.any.trtlive.8.num<-tibble(cluster=kClusters.any.trtlive.8) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.any.trtlive.8.num$cluster<-as.character(cluster.any.trtlive.8.num$cluster) # classic way
cluster.any.trtlive.8.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster

clust.centroid = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

kClustcentroids.any.trtlive.8 <- sapply(levels(factor(kClusters.any.trtlive.8)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.8)
kClustcentroids.any.trtlive.8 %>% head()
##                                  1           2           3          4
## 1a1_q_002_S1_R1_001     0.66774332 -0.32765717 -0.04664319 -0.3967766
## 1a3_q_004_S3_R1_001    -0.16465433 -0.35598558 -0.23130395 -0.6495724
## 1a7_q_007_d8_S7_R1_001 -0.09954455  0.07985282 -0.05066739 -0.2320192
## 1a8_q_008_d8_S8_R1_001  0.29726762 -0.41141910 -0.16242202 -0.6164058
## 1c6_q_028_S22_R1_001    0.05698738 -0.44714663 -0.12673854 -0.4659240
## 1d3_q_038_S27_R1_001   -0.06494311 -0.01126279 -0.13568017 -0.4660710
##                                  5          6           7            8
## 1a1_q_002_S1_R1_001    -0.15395817 -0.9249212 -0.35478101 -0.584426385
## 1a3_q_004_S3_R1_001    -0.27426807 -0.4472171  0.58951253  0.598921297
## 1a7_q_007_d8_S7_R1_001 -0.01876720 -0.3831983  0.72398958  0.051023263
## 1a8_q_008_d8_S8_R1_001 -0.28330120 -0.4472883  0.55642867 -0.032406247
## 1c6_q_028_S22_R1_001   -0.11632418  0.3594808 -0.04093214  0.001411183
## 1d3_q_038_S27_R1_001   -0.01812421 -0.2720918  0.51892604 -0.120575363

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.any.trtlive.8 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.any.trtlive.8.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]) 
# plot
p8.any.trtlive<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",color = "Cluster",y="scaled expression level") 
p8.any.trtlive

ggsave(p8.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.8clusters.png",width=11,height=8)

Let’s perform the actual clsutering using K=15:

set.seed(20)
kClust.any.trtlive.15 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.15 <- kClust.any.trtlive.15$cluster
# number of clusters
cluster.any.trtlive.15.num<-tibble(cluster=kClusters.any.trtlive.15) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.any.trtlive.15.num$cluster<-as.character(cluster.any.trtlive.15.num$cluster) # classic way
cluster.any.trtlive.15.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

clust.centroid = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind,])
}

kClustcentroids.any.trtlive.15 <- sapply(levels(factor(kClusters.any.trtlive.15)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.15)
kClustcentroids.any.trtlive.15 %>% head()
##                                 1            2          3          4
## 1a1_q_002_S1_R1_001     0.5363413 -0.368342751 -0.1930865 -0.9943538
## 1a3_q_004_S3_R1_001    -0.7294333 -0.364642600 -0.7338475 -0.4490430
## 1a7_q_007_d8_S7_R1_001 -0.3427135 -0.002591793 -0.1843309 -0.4124467
## 1a8_q_008_d8_S8_R1_001 -0.2509873 -0.479632975 -0.5871281 -0.4363606
## 1c6_q_028_S22_R1_001   -0.2064792 -0.519653925 -0.4785148  0.4240465
## 1d3_q_038_S27_R1_001   -0.4523775 -0.096021258 -0.3338139 -0.2100288
##                                  5            6          7           8
## 1a1_q_002_S1_R1_001     0.53906726  0.677494289 -0.2143971 -0.53950313
## 1a3_q_004_S3_R1_001     0.04061909  0.087791367 -0.3582435  0.18574802
## 1a7_q_007_d8_S7_R1_001  0.40465292 -0.031086513 -0.1696025  0.02199784
## 1a8_q_008_d8_S8_R1_001  0.40058289  0.358359368 -0.3610603  0.14468767
## 1c6_q_028_S22_R1_001   -0.20400698  0.146627590 -0.1992854  0.49896429
## 1d3_q_038_S27_R1_001    0.11604338 -0.009315873 -0.2560809 -0.17461624
##                                  9          10          11         12
## 1a1_q_002_S1_R1_001    -0.09328228 -0.05176534 -0.34883557 -0.5686172
## 1a3_q_004_S3_R1_001    -0.11607911 -0.12476358  0.94170519  0.6524345
## 1a7_q_007_d8_S7_R1_001 -0.03715975  0.14627723 -0.06335732  0.7831266
## 1a8_q_008_d8_S8_R1_001 -0.08119719 -0.14666264  0.10727360  0.4921936
## 1c6_q_028_S22_R1_001   -0.06907810  0.04886612  0.18259940 -0.2771016
## 1d3_q_038_S27_R1_001   -0.04874525  0.48051250 -0.19548676  0.7062537
##                                 13           14         15
## 1a1_q_002_S1_R1_001    -0.61805695 -0.317859857 -0.5165267
## 1a3_q_004_S3_R1_001     0.06162628 -0.315313752 -0.6740801
## 1a7_q_007_d8_S7_R1_001  0.14337203  0.066701371 -0.0568714
## 1a8_q_008_d8_S8_R1_001 -0.16162715 -0.378009042 -0.6538674
## 1c6_q_028_S22_R1_001   -0.16897390 -0.341682014 -0.4940442
## 1d3_q_038_S27_R1_001   -0.26367077  0.002871791 -0.4328578

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.any.trtlive.15 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.any.trtlive.15.num,by="cluster") %>%
    mutate(cluster.n=glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>%  dplyr::slice(rep(1:1440)[!duplicated(.$group.cluster)]) 
# plot
p15.any.trtlive<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) + 
  geom_point() + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
  facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
  labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): fifteen clusters",color = "Cluster",y="scaled expression level") 
p15.any.trtlive

ggsave(p15.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.15clusters.png",width=11,height=15)

GO ORA of each cluster

# 8 Kmeans cluster (my way using enframe, which I am not satisfied)
temp<-tibble(transcript_ID=cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread$transcript_ID, cluster=kClusters.any.trtlive.8) %>% split(.$cluster) %>% map(function(x) {GOseq.Brgo.v3.0.Atgoslim.BP.list.ORA(genelist=x$transcript_ID)})
# convert list to data.frame
temp %>% enframe(name="cluster") %>% unnest(value) %>% write_csv(path="../output/twoafternoon.any.trtsoil.DEG.Kmeans.8cluster.csv")

Julin’s method

Diurnal any trt DEGS (3/4 days) clustering and cluster ORAs

dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% head()
# scaling expression data
cpm.timecourse.v3.0.scale<-t(scale(t(cpm.timecourse.v3.0[,-1]))) %>% as_tibble() %>% bind_cols(data.frame(transcript_ID=cpm.timecourse.v3.0$transcript_ID[]),.)

# diurnal 3and4 days DEG expression data (scaled)
cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG<-cpm.timecourse.v3.0.scale  %>% 
  inner_join(dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_day %in% c("03","04")) #[1] 4406  121
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
with(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG,table(sampling_day,sample)) # OK
##             sample
## sampling_day 1a1_q_002_S1_R1_001 1a2_q_003_S2_R1_001 1a4_q_005_S4_R1_001
##           03                   0                   0                4406
##           04                4406                4406                   0
##             sample
## sampling_day 1a6_q_007_S6_R1_001 1b2_q_013_S10_R1_001 1b4_q_015_S12_R1_001
##           03                4406                    0                 4406
##           04                   0                 4406                    0
##             sample
## sampling_day 1b5_q_016_S13_R1_001 1b8_q_022_S16_R1_001 1c1_q_023_S17_R1_001
##           03                 4406                    0                 4406
##           04                    0                 4406                    0
##             sample
## sampling_day 1c5_q_027_S21_R1_001 1c7_q_031_S23_R1_001 1c8_q_032_S24_R1_001
##           03                    0                    0                    0
##           04                 4406                 4406                 4406
##             sample
## sampling_day 1d2_q_037_S26_R1_001 1d6_q_044_S30_R1_001 1e2_q_050_S34_R1_001
##           03                    0                 4406                 4406
##           04                 4406                    0                    0
##             sample
## sampling_day 1f2_q_062_S42_R1_001 1f4_q_066_S44_R1_001 1f5_q_068_S45_R1_001
##           03                 4406                    0                 4406
##           04                    0                 4406                    0
##             sample
## sampling_day 1f7_q_071_S47_R1_001 1f8_q_072_S48_R1_001 1g7_q_082_S55_R1_001
##           03                 4406                    0                 4406
##           04                    0                 4406                    0
##             sample
## sampling_day 1h5_q_091_S61_R1_001 1h6_q_095_S62_R1_001 1h7_q_096_S63_R1_001
##           03                 4406                    0                 4406
##           04                    0                 4406                    0
##             sample
## sampling_day 1i1_q_098_S65_R1_001 1i4_q_106_S68_R1_001 1i6_q_108_S70_R1_001
##           03                    0                 4406                    0
##           04                 4406                    0                 4406
##             sample
## sampling_day 1i8_q_111_S72_R1_001 1j1_q_112_S73_R1_001 1j2_q_113_S74_R1_001
##           03                    0                 4406                    0
##           04                 4406                    0                 4406
##             sample
## sampling_day 1j4_q_115_S76_R1_001 1j5_q_116_S77_R1_001 1j6_q_117_S78_R1_001
##           03                    0                 4406                 4406
##           04                 4406                    0                    0
##             sample
## sampling_day 1j8_q_119_S80_R1_001 1k4_q_127_S84_R1_001 1k7_q_134_S87_R1_001
##           03                    0                    0                    0
##           04                 4406                 4406                 4406
##             sample
## sampling_day 1l1_q_136_S89_R1_001 1l4_q_139_S92_R1_001 1l7_q_143_S95_R1_001
##           03                 4406                 4406                 4406
##           04                    0                    0                    0
##             sample
## sampling_day 1l8_q_144_S96_R1_001 2a1_q_146_S97_R1_001 2a4_q_150_S100_R1_001
##           03                    0                 4406                     0
##           04                 4406                    0                  4406
##             sample
## sampling_day 2a5_q_151_S101_R1_001 2a6_q_152_S102_R1_001 2b3_q_160_S107_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 2c7_q_178_S119_R1_001 2c8_q_179_S120_R1_001 2d1_q_180_S121_R1_001
##           03                     0                     0                     0
##           04                  4406                  4406                  4406
##             sample
## sampling_day 2d3_q_182_S123_R1_001 2d5_q_184_S125_R1_001 2e2_q_196_S130_R1_001
##           03                     0                  4406                  4406
##           04                  4406                     0                     0
##             sample
## sampling_day 2e4_q_199_S132_R1_001 2e5_q_200_S133_R1_001 2e7_q_201_S135_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 2e8_q_203_S136_R1_001 2f2_q_205_S138_R1_001 2f4_q_208_S140_R1_001
##           03                  4406                     0                     0
##           04                     0                  4406                  4406
##             sample
## sampling_day 2f6_q_212_S142_R1_001 2f7_q_213_S143_R1_001 2f8_q_216_S144_R1_001
##           03                     0                  4406                  4406
##           04                  4406                     0                     0
##             sample
## sampling_day 2g3_q_220_S147_R1_001 2g5_q_226_S149_R1_001 2g7_q_228_S151_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 2h1_q_230_S153_R1_001 2h5_q_236_S157_R1_001 2h8_q_240_S160_R1_001
##           03                  4406                  4406                  4406
##           04                     0                     0                     0
##             sample
## sampling_day 2i5_q_247_S165_R1_001 2i8_q_249_S168_R1_001 2j4_q_254_S172_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 2j5_q_255_S173_R1_001 2k3_q_266_S179_R1_001 2k5_q_271_S181_R1_001
##           03                     0                  4406                     0
##           04                  4406                     0                  4406
##             sample
## sampling_day 2k6_q_272_S182_R1_001 2k7_q_273_S183_R1_001 2k8_q_275_S184_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 2l1_q_276_S185_R1_001 2l3_q_280_S187_R1_001 2l4_q_282_S188_R1_001
##           03                     0                  4406                     0
##           04                  4406                     0                  4406
##             sample
## sampling_day 2l5_q_285_S189_R1_001 2l6_q_286_S190_R1_001 3a2_q_292_S194_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 3a4_q_294_S196_R1_001 3a6_q_296_S198_R1_001 3b2_q_301_S202_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 3b7_q_307_S207_R1_001 3b8_q_308_S208_R1_001 3c1_q_311_S209_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 3c3_q_314_S211_R1_001 3c4_q_315_S212_R1_001 3c5_q_317_S213_R1_001
##           03                  4406                     0                  4406
##           04                     0                  4406                     0
##             sample
## sampling_day 3c6_q_318_S214_R1_001 3d5_q_330_S221_R1_001 3d7_q_334_S223_R1_001
##           03                  4406                  4406                     0
##           04                     0                     0                  4406
##             sample
## sampling_day 3d8_q_336_S224_R1_001 3e2_q_342_S226_R1_001 3e3_q_343_S227_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 3e4_q_344_S228_R1_001 3e6_q_350_S230_R1_001 3f1_q_353_S233_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 3f3_q_354_S235_R1_001 3g2_q_364_S242_R1_001 3g4_q_366_S244_R1_001
##           03                     0                     0                  4406
##           04                  4406                  4406                     0
##             sample
## sampling_day 3g5_q_367_S245_R1_001 3h7_q_384_S255_R1_001 3i1_q_388_S257_R1_001
##           03                     0                     0                     0
##           04                  4406                  4406                  4406
##             sample
## sampling_day 3i2_q_389_S258_R1_001 3i3_q_391_S259_R1_001 3i6_q_395_S262_R1_001
##           03                  4406                  4406                  4406
##           04                     0                     0                     0
##             sample
## sampling_day 3i7_q_396_S263_R1_001 3j3_q_401_S267_R1_001 3j4_q_403_S268_R1_001
##           03                  4406                     0                     0
##           04                     0                  4406                  4406
##             sample
## sampling_day 3j6_q_407_S270_R1_001 3j7_q_409_S271_R1_001 3k1_q_411_S273_R1_001
##           03                  4406                  4406                     0
##           04                     0                     0                  4406
##             sample
## sampling_day 3k5_q_415_S277_R1_001 3l2_q_423_S282_R1_001 3l3_q_424_S283_R1_001
##           03                     0                  4406                  4406
##           04                  4406                     0                     0
##             sample
## sampling_day 3l4_q_426_S284_R1_001 3l5_q_427_S285_R1_001 3l7_q_429_S287_R1_001
##           03                  4406                     0                     0
##           04                     0                  4406                  4406
# spread"
cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread<-cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread) # [1] 4406  121
## [1] 4406  121
# calculate wss
wss <- (nrow(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1],2,var))
for (i in 2:30) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1],
                                     centers=i,iter.max = 20)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:30, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

Let’s perform the actual clsutering using K=6:

set.seed(20)
kClust.diurnal34.any.trt.6 <- kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 220300)
kClusters.diurnal34.any.trt.6 <- kClust.diurnal34.any.trt.6$cluster
# number of clusters
cluster.diurnal34.any.trt.6.num<-tibble(cluster=kClusters.diurnal34.any.trt.6) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.diurnal34.any.trt.6.num$cluster<-as.character(cluster.diurnal34.any.trt.6.num$cluster) # classic way
cluster.diurnal34.any.trt.6.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

kClustcentroids.diurnal34.any.trt.6 <- sapply(levels(factor(kClusters.diurnal34.any.trt.6)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], kClusters.diurnal34.any.trt.6)
kClustcentroids.diurnal34.any.trt.6 %>% head()
##                                1            2             3             4
## 1a1_q_002_S1_R1_001   0.15394279  0.574503587 -0.1346771004 -0.4374420358
## 1a2_q_003_S2_R1_001  -0.18939602  0.875668492  0.0141788125  0.0003232468
## 1a4_q_005_S4_R1_001   0.04440172  0.544444068 -0.1535801774 -0.5019607167
## 1a6_q_007_S6_R1_001  -0.48949297 -0.008991221  0.0009558034  0.6395503254
## 1b2_q_013_S10_R1_001 -0.51370767 -0.243401224  0.0986233744  1.3543390190
## 1b4_q_015_S12_R1_001 -0.62238828 -0.316876093  0.0895423127  1.1728175746
##                               5          6
## 1a1_q_002_S1_R1_001  -0.6981031 -0.1141115
## 1a2_q_003_S2_R1_001  -0.7348265  0.1399330
## 1a4_q_005_S4_R1_001  -0.7957776 -0.2638425
## 1a6_q_007_S6_R1_001   0.3361567  0.6853755
## 1b2_q_013_S10_R1_001  0.3719169  0.8161256
## 1b4_q_015_S12_R1_001  0.4386614  0.6863338

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.diurnal34.any.trt.6 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.diurnal34.any.trt.6.num,by="cluster") %>%
    mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% group_by(sampling_time.soil.cluster) %>% summarize(sampling_time.soil.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),by="sampling_time.soil.cluster") %>%  dplyr::slice(rep(1:1800)[!duplicated(.$sampling_time.soil.cluster)]) 
# plot
p6.diurnal34.any.trt<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster),shape=sampling_day)) + 
  geom_jitter(alpha=0.5,width=0.25) + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n~sampling_time,scales="free") + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0),legend.position="none")+
  labs(title= "K-means clustering of diurnal DEGs (day 3 and 4): six clusters",color = "Cluster",y="scaled expression level") 
p6.diurnal34.any.trt

ggsave(p6.diurnal34.any.trt,file="../output/diurnal34.any.trt.DEG.Kmean.6clusters.png",width=11,height=6)

Let’s perform the actual clsutering using K=15:

set.seed(20)
kClust.diurnal34.any.trt.15 <- kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
kClusters.diurnal34.any.trt.15 <- kClust.diurnal34.any.trt.15$cluster
# number of clusters
cluster.diurnal34.any.trt.15.num<-tibble(cluster=kClusters.diurnal34.any.trt.15) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.diurnal34.any.trt.15.num$cluster<-as.character(cluster.diurnal34.any.trt.15.num$cluster) # classic way
cluster.diurnal34.any.trt.15.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

kClustcentroids.diurnal34.any.trt.15 <- sapply(levels(factor(kClusters.diurnal34.any.trt.15)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], kClusters.diurnal34.any.trt.15)
kClustcentroids.diurnal34.any.trt.15 %>% head()
##                                1           2            3          4
## 1a1_q_002_S1_R1_001  -0.64961818 -0.02479961 -0.289615400 -0.9786539
## 1a2_q_003_S2_R1_001  -0.75566949  0.32130679  0.381780630 -0.4732351
## 1a4_q_005_S4_R1_001  -0.83250586 -0.12224420 -0.344503252 -0.7526523
## 1a6_q_007_S6_R1_001  -0.06230493  0.65223688 -0.040911215  1.1752934
## 1b2_q_013_S10_R1_001  0.16136731  0.86825638  0.010317928  0.7885374
## 1b4_q_015_S12_R1_001  0.12967927  0.76800694 -0.003837876  1.2768045
##                                5          6          7          8          9
## 1a1_q_002_S1_R1_001  -0.38370094 -0.1313879 -0.4915427 -0.3730555 -0.1059656
## 1a2_q_003_S2_R1_001   0.06193626 -0.5236392 -0.2458315 -0.7402790  0.4217254
## 1a4_q_005_S4_R1_001  -0.24566920 -0.1293080 -0.4491374 -0.8180449 -0.4273409
## 1a6_q_007_S6_R1_001   0.27824225 -0.4460591  0.1487576  0.7037006  0.3453350
## 1b2_q_013_S10_R1_001  0.68953710 -0.6315544  0.3617727  0.5376185  1.2756411
## 1b4_q_015_S12_R1_001  0.59780985 -0.7326287  0.1385924  0.4173258  1.2709022
##                              10          11          12         13
## 1a1_q_002_S1_R1_001  -0.4651650  0.74986457  0.33568695  0.5109533
## 1a2_q_003_S2_R1_001  -0.2048314  0.03310414 -0.28147008  1.8647825
## 1a4_q_005_S4_R1_001  -0.5358867 -0.03666176  0.09234978  1.0800513
## 1a6_q_007_S6_R1_001   0.7814266 -0.44958670  0.57106597 -0.2548949
## 1b2_q_013_S10_R1_001  1.7723968 -0.11230894 -0.46208247 -0.1866692
## 1b4_q_015_S12_R1_001  1.3563270 -0.24063026 -0.56602888 -0.2174072
##                                 14         15
## 1a1_q_002_S1_R1_001  -0.0798020913  0.1730526
## 1a2_q_003_S2_R1_001  -0.0626269210  0.4743356
## 1a4_q_005_S4_R1_001  -0.1411833436  0.4841148
## 1a6_q_007_S6_R1_001  -0.0003048739 -0.5710707
## 1b2_q_013_S10_R1_001  0.0193753243 -0.5400588
## 1b4_q_015_S12_R1_001  0.0260617892 -0.6416530

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.diurnal34.any.trt.15 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.diurnal34.any.trt.15.num,by="cluster") %>%
    mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% group_by(sampling_time.soil.cluster) %>% summarize(sampling_time.soil.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),by="sampling_time.soil.cluster") %>%  dplyr::slice(rep(1:1800)[!duplicated(.$sampling_time.soil.cluster)]) 
# plot
p15.diurnal34.any.trt<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster),shape=sampling_day)) + 
  geom_jitter(alpha=0.2) + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n~sampling_time,scales="free") + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0),legend.position="none")+
  labs(title= "K-means clustering of diurnal DEGs (day 3 and 4): fifteen clusters",color = "Cluster",y="scaled expression level") 
p15.diurnal34.any.trt

ggsave(p15.diurnal34.any.trt,file="../output/diurnal34.any.trt.DEG.Kmean.15clusters.png",width=11,height=15)

Diurnal DEGS (13/14 days) clustering and cluster ORAs

dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno %>% head()
# scaling expression data
cpm.timecourse.v3.0.scale %>% head()
# diurnal 3and4 days DEG expression data (scaled)
cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG<-cpm.timecourse.v3.0.scale  %>% 
  inner_join(dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_day %in% c("13","14")) #[1] 11886  121
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
with(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG,table(sampling_day,sample)) # OK
##             sample
## sampling_day 1a5_q_006_S5_R1_001 1b1_q_012_S9_R1_001 1b3_q_014_S11_R1_001
##           13                 415                   0                  415
##           14                   0                 415                    0
##             sample
## sampling_day 1b6_q_017_S14_R1_001 1b7_q_020_S15_R1_001 1c2_q_024_S18_R1_001
##           13                    0                  415                    0
##           14                  415                    0                  415
##             sample
## sampling_day 1c3_q_025_S19_R1_001 1c4_q_026_S20_R1_001 1c6_q_028_S22_R1_001
##           13                    0                    0                    0
##           14                  415                  415                  415
##             sample
## sampling_day 1d1_q_035_S25_R1_001 1d4_q_040_S28_R1_001 1d5_q_042_S29_R1_001
##           13                    0                  415                  415
##           14                  415                    0                    0
##             sample
## sampling_day 1d7_q_045_S31_R1_001 1d8_q_046_S32_R1_001 1e1_q_048_S33_R1_001
##           13                    0                  415                  415
##           14                  415                    0                    0
##             sample
## sampling_day 1e3_q_053_S35_R1_001 1e4_q_055_S36_R1_001 1e7_q_058_S39_R1_001
##           13                  415                    0                    0
##           14                    0                  415                  415
##             sample
## sampling_day 1f1_q_060_S41_R1_001 1f6_q_070_S46_R1_001 1g1_q_073_S49_R1_001
##           13                  415                  415                    0
##           14                    0                    0                  415
##             sample
## sampling_day 1g2_q_074_S50_R1_001 1g4_q_077_S52_R1_001 1g5_q_080_S53_R1_001
##           13                  415                  415                  415
##           14                    0                    0                    0
##             sample
## sampling_day 1g6_q_081_S54_R1_001 1g8_q_083_S56_R1_001 1h1_q_084_S57_R1_001
##           13                    0                    0                    0
##           14                  415                  415                  415
##             sample
## sampling_day 1h2_q_085_S58_R1_001 1h8_q_097_S64_R1_001 1i3_q_105_S67_R1_001
##           13                  415                    0                    0
##           14                    0                  415                  415
##             sample
## sampling_day 1i5_q_107_S69_R1_001 1i7_q_110_S71_R1_001 1j3_q_114_S75_R1_001
##           13                    0                    0                  415
##           14                  415                  415                    0
##             sample
## sampling_day 1k1_q_120_S81_R1_001 1k3_q_123_S83_R1_001 1k5_q_128_S85_R1_001
##           13                  415                  415                  415
##           14                    0                    0                    0
##             sample
## sampling_day 1l2_q_137_S90_R1_001 1l3_q_138_S91_R1_001 1l5_q_141_S93_R1_001
##           13                  415                  415                    0
##           14                    0                    0                  415
##             sample
## sampling_day 1l6_q_142_S94_R1_001 2a2_q_147_S98_R1_001 2a3_q_148_S99_R1_001
##           13                    0                  415                  415
##           14                  415                    0                    0
##             sample
## sampling_day 2a8_q_154_S104_R1_001 2b1_q_156_S105_R1_001 2b2_q_158_S106_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 2b4_q_161_S108_R1_001 2b5_q_162_S109_R1_001 2b6_q_164_S110_R1_001
##           13                     0                   415                     0
##           14                   415                     0                   415
##             sample
## sampling_day 2c1_q_168_S113_R1_001 2c2_q_169_S114_R1_001 2c5_q_173_S117_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 2c6_q_175_S118_R1_001 2d4_q_183_S124_R1_001 2d6_q_185_S126_R1_001
##           13                     0                     0                     0
##           14                   415                   415                   415
##             sample
## sampling_day 2d8_q_190_S128_R1_001 2e1_q_193_S129_R1_001 2e3_q_198_S131_R1_001
##           13                   415                     0                   415
##           14                     0                   415                     0
##             sample
## sampling_day 2f1_q_204_S137_R1_001 2f3_q_206_S139_R1_001 2f5_q_211_S141_R1_001
##           13                     0                   415                     0
##           14                   415                     0                   415
##             sample
## sampling_day 2g2_q_219_S146_R1_001 2g6_q_227_S150_R1_001 2g8_q_229_S152_R1_001
##           13                   415                     0                   415
##           14                     0                   415                     0
##             sample
## sampling_day 2h2_q_231_S154_R1_001 2h3_q_232_S155_R1_001 2h6_q_237_S158_R1_001
##           13                     0                   415                     0
##           14                   415                     0                   415
##             sample
## sampling_day 2h7_q_238_S159_R1_001 2i2_q_243_S162_R1_001 2i3_q_245_S163_R1_001
##           13                     0                   415                   415
##           14                   415                     0                     0
##             sample
## sampling_day 2i4_q_246_S164_R1_001 2i7_q_248_S167_R1_001 2j2_q_252_S170_R1_001
##           13                   415                     0                   415
##           14                     0                   415                     0
##             sample
## sampling_day 2j3_q_253_S171_R1_001 2j8_q_261_S176_R1_001 2k1_q_263_S177_R1_001
##           13                     0                   415                   415
##           14                   415                     0                     0
##             sample
## sampling_day 2k2_q_265_S178_R1_001 2k4_q_267_S180_R1_001 2l2_q_278_S186_R1_001
##           13                     0                     0                   415
##           14                   415                   415                     0
##             sample
## sampling_day 2l7_q_287_S191_R1_001 2l8_q_288_S192_R1_001 3a3_q_293_S195_R1_001
##           13                     0                     0                   415
##           14                   415                   415                     0
##             sample
## sampling_day 3a5_q_295_S197_R1_001 3a7_q_297_S199_R1_001 3a8_q_299_S200_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 3b1_q_300_S201_R1_001 3b3_q_302_S203_R1_001 3b4_q_303_S204_R1_001
##           13                     0                     0                     0
##           14                   415                   415                   415
##             sample
## sampling_day 3b6_q_306_S206_R1_001 3c2_q_312_S210_R1_001 3c8_q_323_S216_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 3d1_q_324_S217_R1_001 3d2_q_325_S218_R1_001 3d3_q_326_S219_R1_001
##           13                     0                   415                   415
##           14                   415                     0                     0
##             sample
## sampling_day 3d4_q_329_S220_R1_001 3e1_q_339_S225_R1_001 3e5_q_348_S229_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 3e7_q_351_S231_R1_001 3e8_q_352_S232_R1_001 3f5_q_358_S237_R1_001
##           13                     0                   415                     0
##           14                   415                     0                   415
##             sample
## sampling_day 3f6_q_359_S238_R1_001 3g1_q_362_S241_R1_001 3g3_q_365_S243_R1_001
##           13                     0                     0                     0
##           14                   415                   415                   415
##             sample
## sampling_day 3g6_q_369_S246_R1_001 3g7_q_370_S247_R1_001 3g8_q_371_S248_R1_001
##           13                   415                     0                   415
##           14                     0                   415                     0
##             sample
## sampling_day 3h1_q_372_S249_R1_001 3h4_q_376_S252_R1_001 3h6_q_378_S254_R1_001
##           13                   415                     0                   415
##           14                     0                   415                     0
##             sample
## sampling_day 3i5_q_393_S261_R1_001 3i8_q_397_S264_R1_001 3j1_q_398_S265_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 3j2_q_399_S266_R1_001 3j5_q_405_S269_R1_001 3j8_q_410_S272_R1_001
##           13                     0                     0                   415
##           14                   415                   415                     0
##             sample
## sampling_day 3k2_q_412_S274_R1_001 3k3_q_413_S275_R1_001 3k4_q_414_S276_R1_001
##           13                   415                   415                     0
##           14                     0                     0                   415
##             sample
## sampling_day 3k8_q_420_S280_R1_001 3l6_q_428_S286_R1_001 3l8_q_432_S288_R1_001
##           13                     0                   415                     0
##           14                   415                     0                   415
# spread"
cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread<-cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread) # [1] 11886  121
## [1] 415 121
# calculate wss
wss <- (nrow(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1],2,var))
for (i in 2:30) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1],
                                     centers=i,iter.max = 20)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:30, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

Let’s perform the actual clsutering using K=6:

set.seed(20)
kClust.diurnal1314.any.trt.6 <- kmeans(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
kClusters.diurnal1314.any.trt.6 <- kClust.diurnal1314.any.trt.6$cluster
# number of clusters
cluster.diurnal1314.any.trt.6.num<-tibble(cluster=kClusters.diurnal1314.any.trt.6) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.diurnal1314.any.trt.6.num$cluster<-as.character(cluster.diurnal1314.any.trt.6.num$cluster) # classic way
cluster.diurnal1314.any.trt.6.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

kClustcentroids.diurnal1314.any.trt.6 <- sapply(levels(factor(kClusters.diurnal1314.any.trt.6)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], kClusters.diurnal1314.any.trt.6)
kClustcentroids.diurnal1314.any.trt.6 %>% head()
##                                1          2           3           4           5
## 1a5_q_006_S5_R1_001  -0.27278698  0.3226640  0.02428788 -0.22817029 -0.09108589
## 1b1_q_012_S9_R1_001  -0.09515938  0.3911190 -0.06995477  0.69649873 -0.56483373
## 1b3_q_014_S11_R1_001 -0.02012877  0.8087725 -0.09366928  0.40835854 -0.02250425
## 1b6_q_017_S14_R1_001 -0.06340790  1.0743089 -0.07322537 -0.04130941 -0.14094087
## 1b7_q_020_S15_R1_001  0.08215561 -0.2202796 -0.09172485  0.94910468 -0.48629465
## 1c2_q_024_S18_R1_001 -0.18829890  0.2982033 -0.13371672  0.29951480 -0.47074934
##                                6
## 1a5_q_006_S5_R1_001   0.75590838
## 1b1_q_012_S9_R1_001  -0.28492633
## 1b3_q_014_S11_R1_001  0.50962342
## 1b6_q_017_S14_R1_001  0.58617165
## 1b7_q_020_S15_R1_001 -0.20442966
## 1c2_q_024_S18_R1_001 -0.07883211

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.diurnal1314.any.trt.6 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.diurnal1314.any.trt.6.num,by="cluster") %>%
    mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% group_by(sampling_time.soil.cluster) %>% summarize(sampling_time.soil.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),by="sampling_time.soil.cluster") %>%  dplyr::slice(rep(1:1800)[!duplicated(.$sampling_time.soil.cluster)]) 
# plot
p6.diurnal1314.any.trt<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster),shape=sampling_day)) + 
  geom_jitter(alpha=0.5,width=0.25) + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n~sampling_time,scales="free") + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0),legend.position="none")+
  labs(title= "K-means clustering of diurnal DEGs (day 13 and 14): six clusters",color = "Cluster",y="scaled expression level") 
p6.diurnal1314.any.trt

ggsave(p6.diurnal1314.any.trt,file="../output/diurnal1314.any.trt.DEG.Kmean.6clusters.png",width=11,height=6) # 15 clusters... why????

Let’s perform the actual clsutering using K=15:

set.seed(20)
kClust.diurnal1314.any.trt.15 <- kmeans(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
kClusters.diurnal1314.any.trt.15 <- kClust.diurnal1314.any.trt.15$cluster
# number of clusters
cluster.diurnal1314.any.trt.15.num<-tibble(cluster=kClusters.diurnal1314.any.trt.15) %>% group_by(cluster) %>% summarize(n=n()) 
cluster.diurnal1314.any.trt.15.num$cluster<-as.character(cluster.diurnal1314.any.trt.15.num$cluster) # classic way
cluster.diurnal1314.any.trt.15.num

Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i

kClustcentroids.diurnal1314.any.trt.15 <- sapply(levels(factor(kClusters.diurnal1314.any.trt.15)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], kClusters.diurnal1314.any.trt.15)
kClustcentroids.diurnal1314.any.trt.15 %>% head()
##                               1           2           3           4           5
## 1a5_q_006_S5_R1_001  0.04742393 -0.09217499  0.06968819 -0.01939530  0.37585514
## 1b1_q_012_S9_R1_001  1.03158347 -0.33091601  0.06125328 -0.10626701  0.06704349
## 1b3_q_014_S11_R1_001 0.35433674  0.19127711  0.59181089 -0.07592017 -0.06355553
## 1b6_q_017_S14_R1_001 0.09880705  0.86363272  0.31282606 -0.04667052  0.57680721
## 1b7_q_020_S15_R1_001 1.32429031 -0.70446229 -0.41644143 -0.11020982 -0.50597283
## 1c2_q_024_S18_R1_001 0.35385751 -0.50915924  0.10693178 -0.12079579  0.19183548
##                               6          7           8          9         10
## 1a5_q_006_S5_R1_001  -0.2946402 -0.2663661 -0.44336291  0.8137632  0.1135254
## 1b1_q_012_S9_R1_001  -0.6416759  0.5529310 -0.22453618  0.5023845 -0.5078942
## 1b3_q_014_S11_R1_001 -0.2651513  0.4863177 -0.04005012  0.5536828  0.3295724
## 1b6_q_017_S14_R1_001 -0.2105430 -0.4760724 -0.15349776  1.8412404 -0.1851687
## 1b7_q_020_S15_R1_001 -0.5645009  0.4739545 -0.12256037 -0.1077408 -0.3537980
## 1c2_q_024_S18_R1_001 -0.4589185  0.1999366 -0.11381218  0.3331559 -0.5232549
##                              11         12          13        14          15
## 1a5_q_006_S5_R1_001   0.6889138  0.9788134 -0.45833208 0.9661078 -0.38559244
## 1b1_q_012_S9_R1_001   0.6638472 -0.4843574 -0.13231957 0.2522155  0.90530279
## 1b3_q_014_S11_R1_001 -0.7114466  0.5190911  0.10275010 1.2747567  0.44640559
## 1b6_q_017_S14_R1_001  0.3208429  0.1038389  0.03302613 1.3564667 -0.02622145
## 1b7_q_020_S15_R1_001  0.7455260 -0.2624090 -0.08730064 0.3794383  1.34149861
## 1c2_q_024_S18_R1_001 -0.6527750 -0.1144491  0.10468412 0.0556180  0.33083885

Plotting the centroids to see how they behave: tidyverse version

# adding sample description to data
  data.sample<-kClustcentroids.diurnal1314.any.trt.15 %>% as_tibble(rownames="sample") %>% 
  gather(cluster,value,-1) %>% 
  inner_join(sample.description.timecourse,by="sample") %>% 
  inner_join(cluster.diurnal1314.any.trt.15.num,by="cluster") %>%
    mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
                          cluster2=cluster,
                          n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
  data.group<-data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% group_by(sampling_time.soil.cluster) %>% summarize(sampling_time.soil.cluster.mean=mean(value)) %>%  inner_join(
    data.sample %>% unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>% dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),by="sampling_time.soil.cluster") %>%  dplyr::slice(rep(1:1800)[!duplicated(.$sampling_time.soil.cluster)]) 
# plot
p15.diurnal1314.any.trt<-  ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster),shape=sampling_day)) + 
  geom_jitter(alpha=0.2) + geom_hline(yintercept=0,color="red") + 
  geom_line(data=data.group,aes(x=soil_trt,y=sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n~sampling_time,scales="free") + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0),legend.position="none")+
  labs(title= "K-means clustering of diurnal DEGs (day 13 and 14): fifteen clusters",color = "Cluster",y="scaled expression level") 
p15.diurnal1314.any.trt

ggsave(p15.diurnal1314.any.trt,file="../output/diurnal1314.any.trt.DEG.Kmean.15clusters.png",width=11,height=15)

GO ORA under construction (day 3/4)

GO ORA under construction (day 13/14)

session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] annotate_1.64.0             XML_3.99-0.3               
##  [3] GO.db_3.10.0                AnnotationDbi_1.48.0       
##  [5] goseq_1.38.0                geneLenDataBase_1.22.0     
##  [7] BiasedUrn_1.07              ShortRead_1.44.3           
##  [9] GenomicAlignments_1.22.1    SummarizedExperiment_1.16.1
## [11] DelayedArray_0.12.2         matrixStats_0.55.0         
## [13] Biobase_2.46.0              Rsamtools_2.2.3            
## [15] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
## [17] Biostrings_2.54.0           XVector_0.26.0             
## [19] IRanges_2.20.2              S4Vectors_0.24.3           
## [21] BiocParallel_1.20.1         BiocGenerics_0.32.0        
## [23] cowplot_1.0.0               glue_1.3.1                 
## [25] readxl_1.3.1                forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_0.8.4                
## [29] purrr_0.3.3                 readr_1.3.1                
## [31] tidyr_1.0.2                 tibble_2.1.3               
## [33] ggplot2_3.2.1               tidyverse_1.3.0            
## [35] edgeR_3.28.1                limma_3.42.2               
## [37] knitr_1.28                 
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1       hwriter_1.3.2          ellipsis_0.3.0        
##  [4] fs_1.3.1               rstudioapi_0.11        farver_2.0.3          
##  [7] bit64_0.9-7            fansi_0.4.1            lubridate_1.7.4       
## [10] xml2_1.2.2             splines_3.6.2          jsonlite_1.6.1        
## [13] broom_0.5.5            dbplyr_1.4.2           png_0.1-7             
## [16] compiler_3.6.2         httr_1.4.1             backports_1.1.5       
## [19] assertthat_0.2.1       Matrix_1.2-18          lazyeval_0.2.2        
## [22] cli_2.0.2              htmltools_0.4.0        prettyunits_1.1.1     
## [25] tools_3.6.2            gtable_0.3.0           GenomeInfoDbData_1.2.2
## [28] reshape2_1.4.3         rappdirs_0.3.1         Rcpp_1.0.3            
## [31] cellranger_1.1.0       vctrs_0.2.3            nlme_3.1-144          
## [34] rtracklayer_1.46.0     xfun_0.12              rvest_0.3.5           
## [37] lifecycle_0.1.0        zlibbioc_1.32.0        scales_1.1.0          
## [40] hms_0.5.3              RColorBrewer_1.1-2     curl_4.3              
## [43] yaml_2.2.1             memoise_1.1.0          biomaRt_2.42.0        
## [46] latticeExtra_0.6-29    stringi_1.4.6          RSQLite_2.2.0         
## [49] GenomicFeatures_1.38.2 rlang_0.4.5            pkgconfig_2.0.3       
## [52] bitops_1.0-6           evaluate_0.14          lattice_0.20-40       
## [55] labeling_0.3           bit_1.1-15.2           tidyselect_1.0.0      
## [58] plyr_1.8.5             magrittr_1.5           R6_2.4.1              
## [61] generics_0.0.2         DBI_1.1.0              mgcv_1.8-31           
## [64] pillar_1.4.3           haven_2.2.0            withr_2.1.2           
## [67] RCurl_1.98-1.1         modelr_0.1.6           crayon_1.3.4          
## [70] BiocFileCache_1.10.2   rmarkdown_2.1          jpeg_0.1-8.1          
## [73] progress_1.2.2         locfit_1.5-9.1         grid_3.6.2            
## [76] blob_1.2.1             reprex_0.3.0           digest_0.6.25         
## [79] xtable_1.8-4           openssl_1.4.1          munsell_0.5.0         
## [82] askpass_1.1